suppressPackageStartupMessages({
library(data.table)
library(DESeq2)
library(gplots)
library(here)
library(hyperSpec)
library(RColorBrewer)
library(tidyverse)
library(VennDiagram)
})
suppressMessages({
source(here("UPSCb-common/src/R/featureSelection.R"))
source(here("UPSCb-common/src/R/volcanoPlot.R"))
source(here("UPSCb-common/src/R/gopher.R"))
source(here("Rtoolbox/src/plotEnrichedTreemap.R"))
})
# Qualitative palette (8 colours); subsets of it fill the Venn diagrams below.
pal <- brewer.pal(n = 8, name = "Dark2")

# Blue - white - red gradient with 100 steps, used as the heatmap colours.
hpal <- colorRampPalette(colors = c("blue", "white", "red"))(100)

# Plot margins in effect when the script starts, kept so that they can be
# put back after a plot that needs different margins.
mar <- par("mar")
#' Plot the VST expression of a single gene
#'
#' Draws two ggplot2 figures for the single row of `vst` whose name matches
#' `gene_id`: (1) expression against TIME, coloured by TREATMENT and facetted
#' by TISSUE; (2) expression against CONDITION, coloured and grouped by TISSUE,
#' with a smoothed trend added.
#'
#' @param dds object whose `colData()` provides the TIME, TREATMENT, TISSUE
#'   and CONDITION columns.
#' @param vst expression matrix with gene identifiers as row names; its
#'   columns are assumed to be in the same order as the samples of `dds`
#'   (TODO confirm - the values are bound column-wise to `colData(dds)`).
#' @param gene_id pattern handed to `grepl()`; it must match exactly one row
#'   name of `vst`, otherwise the function stops.
#' @param gene_name optional label appended to the plot title.
#' @return `NULL`; the function is called for its plots.
#'
#' NOTE(review): the defaults `dds=dds`, `vst=vst` and `gene_id=gene_id` refer
#' to themselves and cannot be evaluated, so these three arguments must always
#' be supplied by the caller. They are kept only to leave the signature
#' untouched.
"line_plot" <- function(dds=dds,vst=vst,gene_id=gene_id,gene_name=character(0)){
  message(paste("Plotting",gene_id))

  # gene_id is used as a regular expression; exactly one row has to match
  sel <- grepl(gene_id,rownames(vst))
  stopifnot(sum(sel)==1)

  d <- bind_cols(as.data.frame(colData(dds)),
                 data.frame(value=vst[sel,]))

  # the title is shared by both plots; only the first gene name is used
  name_suffix <- if(length(gene_name)>0) gene_name[1] else ""
  plot_title <- paste("Expression for: ",gene_id,name_suffix)

  # expression over time, per tissue
  p <- ggplot(d,
              aes(x=TIME,y=value,col=TREATMENT)) +
    facet_wrap(.~TISSUE) +
    geom_point() +
    scale_y_continuous(name="VST expression") +
    ggtitle(label=plot_title)
  suppressMessages(suppressWarnings(plot(p)))

  # expression per condition, one trend per tissue
  p <- ggplot(d,
              aes(x=CONDITION,y=value,col=TISSUE,group=TISSUE)) +
    geom_point() + geom_smooth() +
    scale_y_continuous(name="VST expression") +
    ggtitle(label=plot_title)
  suppressMessages(suppressWarnings(plot(p)))

  return(NULL)
}
#' Extract, visualise and export the DESeq2 results of one contrast
#'
#' Calls `results()` on `dds`, optionally draws a volcano plot and several
#' diagnostics of the independent filtering, selects the genes passing the
#' `padj`, `lfc` and `expression_cutoff` thresholds, optionally writes the
#' full and the filtered result tables as CSV and draws a heatmap of the
#' selected genes.
#'
#' @param dds fitted DESeq2 object handed to `results()` and `counts()`.
#' @param vst expression matrix (genes x samples) used for the heatmap; its
#'   rows are assumed to be in the same order as those of `dds` (TODO confirm:
#'   it is subset with a mask computed from the results table).
#' @param contrast a length-one value is passed to `results()` as `name`,
#'   anything longer as `contrast`.
#' @param padj adjusted p-value cutoff (inclusive).
#' @param lfc absolute log2 fold-change cutoff (inclusive).
#' @param plot draw the plots?
#' @param verbose report the filtering details and the number of DE genes?
#' @param export write `<default_prefix>results.csv` and
#'   `<default_prefix>genes.csv` into `default_dir`?
#' @param default_dir output directory, created when missing.
#' @param default_prefix prefix of the two CSV file names.
#' @param labels heatmap column labels (subset with `sample_sel`).
#' @param sample_sel columns of `vst` shown in the heatmap.
#' @param expression_cutoff minimal `baseMean` (inclusive) a gene must reach.
#' @param debug draw additional independent-filtering plots (only when `plot`
#'   is TRUE as well).
#' @return a list of row names: `all` (selected genes), `up` (selected with
#'   log2 fold-change > 0) and `dn` (selected with log2 fold-change < 0).
"extract_results" <- function(dds,vst,contrast,
                              padj=0.01,lfc=0.5,
                              plot=TRUE,verbose=TRUE,
                              export=TRUE,default_dir=here("data/analysis/DE"),
                              default_prefix="DE-",
                              labels=colnames(dds),
                              sample_sel=seq_len(ncol(dds)),
                              expression_cutoff=0,
                              debug=FALSE){

  # a single value is a coefficient name, anything longer a contrast
  if(length(contrast)==1){
    res <- results(dds,name=contrast)
  } else {
    res <- results(dds,contrast=contrast)
  }

  # independent filtering details stored in the results metadata
  filter_threshold <- metadata(res)$filterThreshold
  filter_num_rej <- metadata(res)$filterNumRej

  # genes passing the FDR and fold-change cutoffs, before any expression
  # cutoff; !is.na() comes first so that a missing padj always yields FALSE
  is_de <- !is.na(res$padj) & res$padj <= padj & abs(res$log2FoldChange) >= lfc

  if(plot){
    # restore the caller's margins even if the volcano plot fails
    old_par <- par(mar=c(5,5,5,5))
    tryCatch(volcanoPlot(res),finally=par(old_par))
  }

  # a look at independent filtering
  if(verbose){
    message(sprintf("The independent filtering cutoff is %s, removing %s of the data",
                    round(filter_threshold,digits=5),
                    names(filter_threshold)))
  }

  if(plot){
    plot(filter_num_rej,
         type="b", ylab="number of rejections",
         xlab="quantiles of filter")
    lines(metadata(res)$lo.fit, col="red")
    abline(v=metadata(res)$filterTheta)
  }

  # the normalised counts are needed by the message and by the plots below
  if(verbose || plot){
    norm_counts <- counts(dds,normalized=TRUE)
  }

  if(verbose){
    max.theta <- filter_num_rej[which.max(filter_num_rej$numRej),"theta"]
    message(sprintf("The independent filtering maximises for %s %% of the data, corresponding to a base mean expression of %s (library-size normalised read)",
                    round(max.theta*100,digits=5),
                    round(quantile(norm_counts,probs=max.theta),digits=5)))
  }

  if(plot){
    # expression quantile at every theta and the number of DE genes that
    # would remain if that quantile was used as the expression cutoff
    qtl.exp <- quantile(norm_counts,probs=filter_num_rej$theta)
    dat <- data.frame(thetas=filter_num_rej$theta,
                      qtl.exp=qtl.exp,
                      number.degs=vapply(qtl.exp,function(qe){
                        sum(is_de & res$baseMean >= qe)
                      },integer(1)))

    # number of DE genes lost between two consecutive quantiles
    degs_per_interval <- diff(dat$number.degs[1] - dat$number.degs)

    plot(ggplot(dat,aes(x=thetas,y=qtl.exp)) +
           geom_line() + geom_point() +
           scale_x_continuous("quantiles of expression") +
           scale_y_continuous("base mean expression") +
           geom_hline(yintercept=expression_cutoff,
                      linetype="dotted",col="red"))

    if(debug){
      p <- ggplot(dat,aes(x=thetas,y=qtl.exp)) +
        geom_line() + geom_point() +
        scale_x_continuous("quantiles of expression") +
        scale_y_log10("base mean expression") +
        geom_hline(yintercept=expression_cutoff,
                   linetype="dotted",col="red")
      suppressMessages(suppressWarnings(plot(p)))

      plot(ggplot(dat,aes(x=thetas,y=number.degs)) +
             geom_line() + geom_point() +
             geom_hline(yintercept=dat$number.degs[1],linetype="dashed") +
             scale_x_continuous("quantiles of expression") +
             scale_y_continuous("Number of DE genes"))

      plot(ggplot(dat,aes(x=thetas,y=number.degs[1] - number.degs)) +
             geom_line() + geom_point() +
             scale_x_continuous("quantiles of expression") +
             scale_y_continuous("Cumulative number of DE genes"))

      plot(ggplot(data.frame(x=dat$thetas[-1],
                             y=degs_per_interval),aes(x,y)) +
             geom_line() + geom_point() +
             scale_x_continuous("quantiles of expression") +
             scale_y_continuous("Number of DE genes per interval"))

      plot(ggplot(data.frame(x=dat$qtl.exp[-1],
                             y=degs_per_interval),aes(x,y)) +
             geom_line() + geom_point() +
             scale_x_continuous("base mean of expression") +
             scale_y_continuous("Number of DE genes per interval"))
    }

    p <- ggplot(data.frame(x=dat$qtl.exp[-1],
                           y=degs_per_interval),aes(x,y)) +
      geom_line() + geom_point() +
      scale_x_log10("base mean of expression") +
      scale_y_continuous("Number of DE genes per interval") +
      geom_vline(xintercept=expression_cutoff,
                 linetype="dotted",col="red")
    suppressMessages(suppressWarnings(plot(p)))
  }

  # final selection: DE genes that also reach the expression cutoff
  sel <- is_de & res$baseMean >= expression_cutoff

  if(verbose){
    message(sprintf("There are %s genes that are DE with the following parameters: FDR <= %s, |log2FC| >= %s, base mean expression > %s",
                    sum(sel),
                    padj,lfc,expression_cutoff))
  }

  if(export){
    if(!dir.exists(default_dir)){
      dir.create(default_dir,showWarnings=FALSE,recursive=TRUE,mode="0771")
    }
    write.csv(res,file=file.path(default_dir,paste0(default_prefix,"results.csv")))
    write.csv(res[sel,],file.path(default_dir,paste0(default_prefix,"genes.csv")))
  }

  if(plot){
    # the row clustering needs at least two genes
    if(sum(sel) > 1){
      heatmap.2(t(scale(t(vst[sel,sample_sel]))),
                distfun = pearson.dist,
                hclustfun = function(X){hclust(X,method="ward.D2")},
                trace="none",col=hpal,labRow = FALSE,
                labCol=labels[sample_sel])
    } else {
      warning("There are not enough DE genes to create a heatmap")
    }
  }

  return(list(all=rownames(res[sel,]),
              up=rownames(res[sel & res$log2FoldChange > 0,]),
              dn=rownames(res[sel & res$log2FoldChange < 0,])))
}
#' Export and plot the enrichment results of one gene set
#'
#' Optionally writes the `task` element of `enrichment` (and, when `genes` is
#' given, the table returned by `enrichedTermToGenes()`) as TSV files, then
#' optionally draws one treemap per requested GO namespace.
#'
#' @param enrichment list of enrichment results; `enrichment[[task]]` is the
#'   table that is written out and its `id` column lists the enriched terms.
#' @param task name of the element of `enrichment` to process.
#' @param diff.exp which gene set the enrichment was computed for; translated
#'   to the `de` argument of `plotEnrichedTreemap()` ("all" -> "none",
#'   "up" -> "up", "dn" -> "down").
#' @param go.namespace GO namespaces to plot, any of "BP", "CC", "MF".
#' @param genes optional gene identifiers; when given, the term-to-gene table
#'   is exported as well.
#' @param export write the TSV files?
#' @param plot draw the treemaps?
#' @param default_dir output directory (resolved with `here()`), created when
#'   missing.
#' @param default_prefix prefix of the output file names.
#' @param url passed on to `enrichedTermToGenes()`.
#' @param mc.cores passed on to `enrichedTermToGenes()`.
#' @return when `plot` is TRUE, the result of the `sapply()` over
#'   `go.namespace`; otherwise `NULL`, invisibly.
extractEnrichmentResults <- function(enrichment,task="go",
                                     diff.exp=c("all","up","dn"),
                                     go.namespace=c("BP","CC","MF"),
                                     genes=NULL,export=TRUE,plot=TRUE,
                                     default_dir=here("data/analysis/DE"),
                                     default_prefix="DE",
                                     url="athaliana",
                                     mc.cores=16L){
  # process args
  diff.exp <- match.arg(diff.exp)
  de <- switch(diff.exp,
               all = "none",
               up = "up",
               dn = "down")

  # write out
  if(export){
    out_dir <- here(default_dir)
    if(!dir.exists(out_dir)){
      dir.create(out_dir,showWarnings=FALSE,recursive=TRUE)
    }

    # `file` replaces the `path` argument deprecated in readr 1.4.0
    write_tsv(enrichment[[task]],
              file=here(default_dir,
                        paste0(default_prefix,"-genes_GO-enrichment.tsv")))

    if(!is.null(genes)){
      write_tsv(
        enrichedTermToGenes(genes=genes,terms=enrichment[[task]]$id,
                            url=url,mc.cores=mc.cores),
        file=here(default_dir,
                  paste0(default_prefix,"-enriched-term-to-genes.tsv"))
      )
    }
  }

  if(plot){
    titles <- c(BP="Biological Process",
                CC="Cellular Component",
                MF="Molecular Function")

    # a failing treemap is reported and skipped so that the remaining
    # namespaces are still plotted
    sapply(go.namespace,function(ns){
      suppressWarnings(tryCatch({plotEnrichedTreemap(enrichment,enrichment=task,
                                                     namespace=ns,
                                                     de=de,title=titles[ns])},
                                error = function(e) {
                                  message(paste("Treemap plot failed for",ns,
                                                "because of:",e))
                                  return(NULL)
                                }))
    })
  }
}
# Restore the objects saved by the salmon analysis; `dds` is expected to be
# among them, as it is used from here on.
load(file = here("data/analysis/salmon/dds.rda"))

# Variance stabilising transformation that takes the design into account
# (blind = FALSE); the matrix is shifted so that its smallest value is 0.
vsd <- varianceStabilizingTransformation(dds, blind = FALSE)
vst <- assay(vsd)
vst <- vst - min(vst)

# Keep a copy of the transformed expression on disk.
dir.create(here("data/analysis/DE"), showWarnings = FALSE)
save(vst, file = here("data/analysis/DE/salmon-vst-aware.rda"))

# Candidate genes; every column is read as character and all IDs have to be
# present in the expression matrix.
goi <- read_csv(
  here("doc/Candidates_and_flowering_genes.csv"),
  col_types = cols(.default = col_character())
)
stopifnot(all(goi$ID %in% rownames(vst)))

# One pair of plots per candidate: the second column is used as the gene ID,
# the first one as the gene name.
dev.null <- apply(goi, 1, function(candidate) {
  line_plot(
    dds = dds,
    vst = vst,
    gene_id = candidate[2],
    gene_name = candidate[1]
  )
})
## Plotting AT2G47180
## Plotting AT1G56600
## Plotting AT1G09350
## Plotting AT1G60470
## Plotting AT1G60450
## Plotting AT3G28340
## Plotting AT1G55740
## Plotting AT3G57520
## Plotting AT4G01970
## Plotting AT5G40390
## Plotting AT5G20250
## Plotting AT3G06580
## Plotting AT1G65480
## Plotting AT4G20370
## Plotting AT1G22770
## Plotting AT2G45660
## Plotting AT4G35900
## Plotting AT5G15840
## Plotting AT5G61850
## Plotting AT1G28130
## Plotting AT4G27260
## Plotting AT5G03840
# Restrict the data to the apex samples.
ddsApex <- dds[, dds$TISSUE == "APEX"]

# The samples whose TREATMENT is "Na" are appended a second time, so that one
# copy can be labelled "Mock" and the other "Dexa".
ddsApex <- cbind(ddsApex, ddsApex[, ddsApex$TREATMENT == "Na"])

# After the duplication half of the "Na" samples belong to each treatment; the
# count is derived from the data instead of being hard-coded, so a different
# number of replicates cannot be recycled silently.
ddsApex$TREATMENT[ddsApex$TREATMENT == "Na"] <-
  rep(c("Mock", "Dexa"), each = sum(ddsApex$TREATMENT == "Na") / 2)
ddsApex$TREATMENT <- droplevels(ddsApex$TREATMENT)

# Time, treatment and their interaction.
design(ddsApex) <- ~ TIME * TREATMENT
ddsApex <- DESeq(ddsApex)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
# Update the VST for convenience
# Design-aware variance stabilising transformation of the apex subset,
# shifted so that the smallest value is 0.
vsdApex <- varianceStabilizingTransformation(ddsApex, blind = FALSE)
vstApex <- assay(vsdApex) - min(assay(vsdApex))

# Dispersion estimates of the apex model.
plotDispEsts(ddsApex)
# Check the different contrasts
# Coefficient names of the apex model; the interaction names listed here are
# the ones given as `contrast` to extract_results() further down.
resultsNames(ddsApex)
## [1] "Intercept" "TIME_T1_vs_T0" "TIME_T3_vs_T0"
## [4] "TREATMENT_Dexa_vs_Mock" "TIMET1.TREATMENTDexa" "TIMET3.TREATMENTDexa"
# CHANGEME - here you need to define the contrast you want to study - see the example in the next block.
# The contrast can be given by name, as a list (numerator/denominator) or as a vector of weights (e.g. c(0,1)); read the DESeq2 vignette for more info.
# The labels argument is typically one (or a combination) of the metadata stored in colData.
# The function allows for looking at the independent filtering results using debug=TRUE.
# If you are not satisfied with the default from DESeq2, you can set your own cutoff using expression_cutoff.
# You can change the default output file prefix using default_prefix.
# You can select the set of samples to be added to the heatmap using the sample_sel argument. It takes a logical vector.
# Apex, TIME T1 x TREATMENT Dexa interaction coefficient; the heatmap shows
# the T1 samples only, labelled by treatment.
AdexaT1 <- extract_results(
  dds = ddsApex,
  vst = vstApex,
  contrast = "TIMET1.TREATMENTDexa",
  default_prefix = "Salmon-Apex_Dexa-vs-Mock_T1_",
  labels = ddsApex$TREATMENT,
  sample_sel = ddsApex$TIME == "T1"
)
## Loading required package: LSD
## The independent filtering cutoff is 3.2341, removing 35.30366% of the data
## The independent filtering cutoff is 3.2341, removing 35.30366% of the data
## The independent filtering maximises for 39.89569 % of the data, corresponding to a base mean expression of 9.81298 (library-size normalised read)
## There are 381 genes that are DE with the following parameters: FDR <= 0.01, |log2FC| >= 0.5, base mean expression > 0

# Apex, TIME T3 x TREATMENT Dexa interaction coefficient; the heatmap shows
# the T3 samples only, labelled by treatment.
AdexaT3 <- extract_results(
  dds = ddsApex,
  vst = vstApex,
  contrast = "TIMET3.TREATMENTDexa",
  default_prefix = "Salmon-Apex_Dexa-vs-Mock_T3_",
  labels = ddsApex$TREATMENT,
  sample_sel = ddsApex$TIME == "T3"
)
## The independent filtering cutoff is 7.50032, removing 38.36501% of the data
## The independent filtering cutoff is 7.50032, removing 38.36501% of the data
## The independent filtering maximises for 39.89569 % of the data, corresponding to a base mean expression of 9.81298 (library-size normalised read)
## There are 392 genes that are DE with the following parameters: FDR <= 0.01, |log2FC| >= 0.5, base mean expression > 0
# Gene sets of the two apex comparisons.
res.list <- list(
  AdexaT1 = AdexaT1,
  AdexaT3 = AdexaT3
)

# One Venn diagram per gene set, in this order: all DE genes, genes with a
# positive log2 fold-change ("up"), genes with a negative one ("dn").
invisible(lapply(c("all", "up", "dn"), function(gene_set) {
  grid.newpage()
  grid.draw(venn.diagram(
    x = lapply(res.list, "[[", gene_set),
    filename = NULL,
    category.names = names(res.list),
    fill = pal[1:2]
  ))
}))
# Restrict the data to the leaf samples.
ddsLeaf <- dds[, dds$TISSUE == "LEAF"]

# The samples whose TREATMENT is "Na" are appended a second time, so that one
# copy can be labelled "Mock" and the other "Dexa".
ddsLeaf <- cbind(ddsLeaf, ddsLeaf[, ddsLeaf$TREATMENT == "Na"])

# After the duplication half of the "Na" samples belong to each treatment; the
# count is derived from the data instead of being hard-coded, so a different
# number of replicates cannot be recycled silently.
ddsLeaf$TREATMENT[ddsLeaf$TREATMENT == "Na"] <-
  rep(c("Mock", "Dexa"), each = sum(ddsLeaf$TREATMENT == "Na") / 2)
ddsLeaf$TREATMENT <- droplevels(ddsLeaf$TREATMENT)

# Time, treatment and their interaction.
design(ddsLeaf) <- ~ TIME * TREATMENT
ddsLeaf <- DESeq(ddsLeaf)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
# Update the VST for convenience
# Design-aware variance stabilising transformation of the leaf subset,
# shifted so that the smallest value is 0.
vsdLeaf <- varianceStabilizingTransformation(ddsLeaf, blind = FALSE)
vstLeaf <- assay(vsdLeaf) - min(assay(vsdLeaf))

# Dispersion estimates of the leaf model.
plotDispEsts(ddsLeaf)
# Check the different contrasts
# Coefficient names of the leaf model; the interaction names listed here are
# the ones given as `contrast` to extract_results() further down.
resultsNames(ddsLeaf)
## [1] "Intercept" "TIME_T1_vs_T0" "TIME_T3_vs_T0"
## [4] "TREATMENT_Dexa_vs_Mock" "TIMET1.TREATMENTDexa" "TIMET3.TREATMENTDexa"
# CHANGEME - here you need to define the contrast you want to study - see the example in the next block.
# The contrast can be given by name, as a list (numerator/denominator) or as a vector of weights (e.g. c(0,1)); read the DESeq2 vignette for more info.
# The labels argument is typically one (or a combination) of the metadata stored in colData.
# The function allows for looking at the independent filtering results using debug=TRUE.
# If you are not satisfied with the default from DESeq2, you can set your own cutoff using expression_cutoff.
# You can change the default output file prefix using default_prefix.
# You can select the set of samples to be added to the heatmap using the sample_sel argument. It takes a logical vector.
# Leaf, TIME T1 x TREATMENT Dexa interaction coefficient; the heatmap shows
# the T1 samples only, labelled by treatment.
LdexaT1 <- extract_results(
  dds = ddsLeaf,
  vst = vstLeaf,
  contrast = "TIMET1.TREATMENTDexa",
  default_prefix = "Salmon-Leaf_Dexa-vs-Mock_T1_",
  labels = ddsLeaf$TREATMENT,
  sample_sel = ddsLeaf$TIME == "T1"
)
## The independent filtering cutoff is 0.83397, removing 34.49993% of the data
## The independent filtering cutoff is 0.83397, removing 34.49993% of the data
## The independent filtering maximises for 40.26184 % of the data, corresponding to a base mean expression of 3.76319 (library-size normalised read)
## There are 5 genes that are DE with the following parameters: FDR <= 0.01, |log2FC| >= 0.5, base mean expression > 0

# Leaf, TIME T3 x TREATMENT Dexa interaction coefficient; the heatmap shows
# the T3 samples only, labelled by treatment.
LdexaT3 <- extract_results(
  dds = ddsLeaf,
  vst = vstLeaf,
  contrast = "TIMET3.TREATMENTDexa",
  default_prefix = "Salmon-Leaf_Dexa-vs-Mock_T3_",
  labels = ddsLeaf$TREATMENT,
  sample_sel = ddsLeaf$TIME == "T3"
)
## The independent filtering cutoff is 0.02561, removing 24.41659% of the data
## The independent filtering cutoff is 0.02561, removing 24.41659% of the data
## The independent filtering maximises for 24.41659 % of the data, corresponding to a base mean expression of 0 (library-size normalised read)
## There are 1 genes that are DE with the following parameters: FDR <= 0.01, |log2FC| >= 0.5, base mean expression > 0
## Warning in extract_results(ddsLeaf, vstLeaf, contrast =
## "TIMET3.TREATMENTDexa", : There are not enough DE genes to create a heatmap
# Gene sets of the two leaf comparisons.
res.list <- list(
  LdexaT1 = LdexaT1,
  LdexaT3 = LdexaT3
)

# One Venn diagram per gene set, in this order: all DE genes, genes with a
# positive log2 fold-change ("up"), genes with a negative one ("dn").
invisible(lapply(c("all", "up", "dn"), function(gene_set) {
  grid.newpage()
  grid.draw(venn.diagram(
    x = lapply(res.list, "[[", gene_set),
    filename = NULL,
    category.names = names(res.list),
    fill = pal[1:2]
  ))
}))
# Gene sets of all four comparisons, apex first; the enrichment step further
# down relies on this order.
res.list <- list(
  AdexaT1 = AdexaT1,
  AdexaT3 = AdexaT3,
  LdexaT1 = LdexaT1,
  LdexaT3 = LdexaT3
)

# One four-way Venn diagram per gene set, in this order: all DE genes, genes
# with a positive log2 fold-change ("up"), genes with a negative one ("dn").
invisible(lapply(c("all", "up", "dn"), function(gene_set) {
  grid.newpage()
  grid.draw(venn.diagram(
    x = lapply(res.list, "[[", gene_set),
    filename = NULL,
    category.names = names(res.list),
    fill = pal[1:4]
  ))
}))
# Background for the enrichment tests: the genes of the full VST matrix
# retained by featureSelect() (per CONDITION, exp = 0.2).
background <- rownames(vst)[
  featureSelect(vst, dds$CONDITION, exp = 0.2)
]

# Every gene reported by any comparison has to be part of the background.
stopifnot(all(unlist(res.list) %in% background))
# There are too few genes to generate an enrichment for the leaf, so we focus on the apex comparisons
# GO enrichment of every gene set (all / up / dn) of the first two entries of
# res.list, tested against the background.
enr.list <- lapply(res.list[1:2], function(gene_sets) {
  lapply(
    gene_sets,
    gopher,
    background = background,
    task = "go",
    url = "athaliana"
  )
})

# Export and plot every enrichment; the comparison name and the gene set are
# reported as progress and combined into the output file prefix.
dev.null <- lapply(names(enr.list), function(comparison) {
  message(comparison)
  lapply(names(enr.list[[comparison]]), function(gene_set) {
    message(gene_set)
    extractEnrichmentResults(
      enr.list[[comparison]][[gene_set]],
      diff.exp = gene_set,
      genes = res.list[[comparison]][[gene_set]],
      default_prefix = paste(comparison, gene_set, sep = "-")
    )
  })
})
## AdexaT1
## all
## Warning: The `path` argument of `write_tsv()` is deprecated as of readr 1.4.0.
## Please use the `file` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Loading required package: treemap
## up
## dn
## Treemap plot failed for CC because of: Error in treemap(enrData, index = index, vSize = vSize, palette = palette, : data.frame doesn't have any rows
## AdexaT3
## all
## up
## dn
## Treemap plot failed for CC because of: Error in treemap(enrData, index = index, vSize = vSize, palette = palette, : data.frame doesn't have any rows
## Treemap plot failed for MF because of: Error in treemap(enrData, index = index, vSize = vSize, palette = palette, : data.frame doesn't have any rows
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] treemap_2.4-2 GO.db_3.12.1
## [3] AnnotationDbi_1.52.0 jsonlite_1.7.2
## [5] LSD_4.1-0 limma_3.46.0
## [7] VennDiagram_1.6.20 futile.logger_1.4.3
## [9] forcats_0.5.0 stringr_1.4.0
## [11] dplyr_1.0.3 purrr_0.3.4
## [13] readr_1.4.0 tidyr_1.1.2
## [15] tibble_3.0.5 tidyverse_1.3.0
## [17] RColorBrewer_1.1-2 hyperSpec_0.99-20201127
## [19] xml2_1.3.2 ggplot2_3.3.3
## [21] lattice_0.20-41 here_1.0.1
## [23] gplots_3.1.1 DESeq2_1.30.0
## [25] SummarizedExperiment_1.20.0 Biobase_2.50.0
## [27] MatrixGenerics_1.2.0 matrixStats_0.57.0
## [29] GenomicRanges_1.42.0 GenomeInfoDb_1.26.2
## [31] IRanges_2.24.1 S4Vectors_0.28.1
## [33] BiocGenerics_0.36.0 data.table_1.13.6
##
## loaded via a namespace (and not attached):
## [1] colorspace_2.0-0 ellipsis_0.3.1 rprojroot_2.0.2
## [4] XVector_0.30.0 fs_1.5.0 rstudioapi_0.13
## [7] farver_2.0.3 bit64_4.0.5 fansi_0.4.2
## [10] lubridate_1.7.9.2 splines_4.0.3 geneplotter_1.68.0
## [13] knitr_1.30 gridBase_0.4-7 broom_0.7.3
## [16] annotate_1.68.0 dbplyr_2.0.0 png_0.1-7
## [19] shiny_1.5.0 compiler_4.0.3 httr_1.4.2
## [22] backports_1.2.1 fastmap_1.0.1 assertthat_0.2.1
## [25] Matrix_1.3-2 lazyeval_0.2.2 cli_2.2.0
## [28] later_1.1.0.1 formatR_1.7 htmltools_0.5.1
## [31] tools_4.0.3 igraph_1.2.6 gtable_0.3.0
## [34] glue_1.4.2 GenomeInfoDbData_1.2.4 Rcpp_1.0.6
## [37] cellranger_1.1.0 vctrs_0.3.6 nlme_3.1-151
## [40] xfun_0.20 testthat_3.0.1 rvest_0.3.6
## [43] mime_0.9 lifecycle_0.2.0 gtools_3.8.2
## [46] XML_3.99-0.5 zlibbioc_1.36.0 scales_1.1.1
## [49] promises_1.1.1 hms_1.0.0 lambda.r_1.2.4
## [52] curl_4.3 yaml_2.2.1 memoise_1.1.0
## [55] latticeExtra_0.6-29 stringi_1.5.3 RSQLite_2.2.2
## [58] highr_0.8 genefilter_1.72.0 caTools_1.18.1
## [61] BiocParallel_1.24.1 rlang_0.4.10 pkgconfig_2.0.3
## [64] bitops_1.0-6 evaluate_0.14 labeling_0.4.2
## [67] bit_4.0.4 tidyselect_1.1.0 magrittr_2.0.1
## [70] R6_2.5.0 generics_0.1.0 DelayedArray_0.16.0
## [73] DBI_1.1.1 mgcv_1.8-33 pillar_1.4.7
## [76] haven_2.3.1 withr_2.4.0 survival_3.2-7
## [79] RCurl_1.98-1.2 modelr_0.1.8 crayon_1.3.4
## [82] futile.options_1.0.1 KernSmooth_2.23-18 rmarkdown_2.6
## [85] jpeg_0.1-8.1 locfit_1.5-9.4 readxl_1.3.1
## [88] blob_1.2.1 reprex_0.3.0 digest_0.6.27
## [91] xtable_1.8-4 httpuv_1.5.5 munsell_0.5.0